Executive Summary

This analysis evaluates Meridian Health Systems’ “Thrive at Work” wellness program using machine learning techniques to:

  1. Predict program success using classification models (Random Forest & SVM)
  2. Segment employees into behavioral profiles using clustering (K-means & K-medoids)

1. Data Loading and Preparation

1.1 Load Required Libraries

# Data manipulation
library(tidyverse)      # dplyr/tidyr/ggplot2 meta-package (pipes, select, mutate)
library(data.table)

# Machine Learning
library(caret)          # createDataPartition, preProcess, confusionMatrix
library(randomForest)   # randomForest() classifier + importance()
library(e1071)          # svm() with linear and radial kernels
library(cluster)        # K-medoids (pam) for the clustering section

# Visualization
library(ggplot2)        # already attached via tidyverse; explicit for clarity
library(GGally)
library(corrplot)       # correlation heatmap
library(gridExtra)
library(factoextra)     # cluster visualization helpers
library(patchwork)      # plot composition with | (row) and / (stack)

# Model evaluation
library(pROC)           # roc(), auc()
library(ROCR)

# Set seed for reproducibility
set.seed(42)

1.2 Load Data

# Load the data
# NOTE(review): relative path -- assumes the working directory contains a
# "data/" folder; confirm before re-running.
wellness_data <- read.csv("data/workplace_wellness_data.csv", stringsAsFactors = FALSE)

# Display structure (1000 rows x 18 columns per the output below)
str(wellness_data)
## 'data.frame':    1000 obs. of  18 variables:
##  $ employee_id          : chr  "EMP_0001" "EMP_0002" "EMP_0003" "EMP_0004" ...
##  $ age                  : int  37 28 45 41 56 44 33 26 48 38 ...
##  $ gender               : chr  "Male" "Male" "Female" "Female" ...
##  $ department           : chr  "Sales" "Operations" "Administration" "Sales" ...
##  $ years_employed       : num  8 9.4 7.1 3.4 0.5 7 2.2 3.4 0.5 6.6 ...
##  $ bmi                  : num  21.4 31 24.6 34.1 27.4 18 25.9 31.8 23.2 30 ...
##  $ bp_systolic          : int  123 148 149 164 141 122 137 129 132 151 ...
##  $ resting_hr           : int  67 70 58 70 88 63 75 65 66 80 ...
##  $ baseline_activity_hrs: num  1.3 0.8 0.4 1.9 1.6 5.6 0.9 0.4 0.6 0.8 ...
##  $ sleep_quality        : num  2.5 3.2 8.8 4 6 7.7 6 4.3 6 5.7 ...
##  $ stress_score         : num  9.8 8 6.5 7.9 7 3.1 4.4 8.9 6.7 4.8 ...
##  $ job_satisfaction     : num  6 6.8 4.6 3.6 6.5 10 6.2 4.3 6.6 6.5 ...
##  $ social_support       : num  8.5 9.6 9.9 7.4 5.9 6.9 7.7 6 8.6 4.7 ...
##  $ self_efficacy        : num  6.3 6.2 4.4 6.6 7.9 6.1 5.2 8.8 6.2 5.8 ...
##  $ sessions_attended    : int  7 5 7 5 8 11 6 11 6 6 ...
##  $ app_engagement       : num  27.4 25.6 28.5 69.6 55.8 39.7 46.9 89.3 72 38.9 ...
##  $ peer_challenge       : int  0 0 0 0 0 0 1 0 1 1 ...
##  $ program_success      : int  1 0 1 1 0 1 1 0 1 1 ...
# Display first few rows (head() defaults to 6) as a quick sanity check
head(wellness_data)
##   employee_id age gender     department years_employed  bmi bp_systolic
## 1    EMP_0001  37   Male          Sales            8.0 21.4         123
## 2    EMP_0002  28   Male     Operations            9.4 31.0         148
## 3    EMP_0003  45 Female Administration            7.1 24.6         149
## 4    EMP_0004  41 Female          Sales            3.4 34.1         164
## 5    EMP_0005  56 Female  Manufacturing            0.5 27.4         141
## 6    EMP_0006  44   Male             IT            7.0 18.0         122
##   resting_hr baseline_activity_hrs sleep_quality stress_score job_satisfaction
## 1         67                   1.3           2.5          9.8              6.0
## 2         70                   0.8           3.2          8.0              6.8
## 3         58                   0.4           8.8          6.5              4.6
## 4         70                   1.9           4.0          7.9              3.6
## 5         88                   1.6           6.0          7.0              6.5
## 6         63                   5.6           7.7          3.1             10.0
##   social_support self_efficacy sessions_attended app_engagement peer_challenge
## 1            8.5           6.3                 7           27.4              0
## 2            9.6           6.2                 5           25.6              0
## 3            9.9           4.4                 7           28.5              0
## 4            7.4           6.6                 5           69.6              0
## 5            5.9           7.9                 8           55.8              0
## 6            6.9           6.1                11           39.7              0
##   program_success
## 1               1
## 2               0
## 3               1
## 4               1
## 5               0
## 6               1
# Summary statistics: five-number summary + mean for numeric columns,
# length/class for character columns
summary(wellness_data)
##  employee_id             age           gender           department       
##  Length:1000        Min.   :22.00   Length:1000        Length:1000       
##  Class :character   1st Qu.:35.75   Class :character   Class :character  
##  Mode  :character   Median :42.00   Mode  :character   Mode  :character  
##                     Mean   :42.28                                        
##                     3rd Qu.:49.00                                        
##                     Max.   :65.00                                        
##  years_employed        bmi         bp_systolic      resting_hr    
##  Min.   : 0.500   Min.   :18.00   Min.   :113.0   Min.   : 50.00  
##  1st Qu.: 1.600   1st Qu.:24.70   1st Qu.:137.0   1st Qu.: 64.00  
##  Median : 3.700   Median :28.00   Median :143.0   Median : 70.00  
##  Mean   : 5.287   Mean   :28.18   Mean   :143.6   Mean   : 69.92  
##  3rd Qu.: 7.500   3rd Qu.:31.60   3rd Qu.:151.0   3rd Qu.: 75.00  
##  Max.   :30.000   Max.   :45.00   Max.   :173.0   Max.   :100.00  
##  baseline_activity_hrs sleep_quality     stress_score    job_satisfaction
##  Min.   : 0.000        Min.   : 2.000   Min.   : 1.000   Min.   : 1.700  
##  1st Qu.: 0.600        1st Qu.: 4.800   1st Qu.: 4.100   1st Qu.: 5.300  
##  Median : 1.500        Median : 5.900   Median : 5.700   Median : 6.300  
##  Mean   : 2.375        Mean   : 5.940   Mean   : 5.708   Mean   : 6.377  
##  3rd Qu.: 3.300        3rd Qu.: 7.025   3rd Qu.: 7.200   3rd Qu.: 7.500  
##  Max.   :15.000        Max.   :10.000   Max.   :10.000   Max.   :10.000  
##  social_support   self_efficacy    sessions_attended app_engagement 
##  Min.   : 1.000   Min.   : 1.500   Min.   : 2.000    Min.   : 0.50  
##  1st Qu.: 4.800   1st Qu.: 5.100   1st Qu.: 6.000    1st Qu.:24.30  
##  Median : 6.100   Median : 6.100   Median : 7.000    Median :39.40  
##  Mean   : 6.055   Mean   : 6.162   Mean   : 7.336    Mean   :41.99  
##  3rd Qu.: 7.400   3rd Qu.: 7.200   3rd Qu.: 9.000    3rd Qu.:58.70  
##  Max.   :10.000   Max.   :10.000   Max.   :12.000    Max.   :97.90  
##  peer_challenge  program_success
##  Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.000  
##  Median :0.000   Median :1.000  
##  Mean   :0.391   Mean   :0.723  
##  3rd Qu.:1.000   3rd Qu.:1.000  
##  Max.   :1.000   Max.   :1.000

1.3 Data Preprocessing

# Convert categorical variables to factors in one pass; the target
# program_success becomes a factor so caret treats this as classification
categorical_cols <- c("gender", "department", "peer_challenge", "program_success")
wellness_data[categorical_cols] <- lapply(wellness_data[categorical_cols], as.factor)

# Check for missing values
cat("Missing values per column:\n")
## Missing values per column:
# colSums over the logical NA mask -> one count per column (all zero here)
colSums(is.na(wellness_data))
##           employee_id                   age                gender 
##                     0                     0                     0 
##            department        years_employed                   bmi 
##                     0                     0                     0 
##           bp_systolic            resting_hr baseline_activity_hrs 
##                     0                     0                     0 
##         sleep_quality          stress_score      job_satisfaction 
##                     0                     0                     0 
##        social_support         self_efficacy     sessions_attended 
##                     0                     0                     0 
##        app_engagement        peer_challenge       program_success 
##                     0                     0                     0
# Check class distribution -- ~72% success vs ~28% failure per the output
# below; this imbalance matters when judging the classifiers later
cat("\n\nProgram Success Distribution:\n")
## 
## 
## Program Success Distribution:
table(wellness_data$program_success)
## 
##   0   1 
## 277 723
prop.table(table(wellness_data$program_success))
## 
##     0     1 
## 0.277 0.723

2. Exploratory Data Analysis

2.1 Target Variable Distribution

# Bar chart of the program_success counts, each bar labelled with its count
success_palette <- c("0" = "#E74C3C", "1" = "#27AE60")

p1 <- ggplot(wellness_data, aes(x = program_success, fill = program_success)) +
  geom_bar(width = 0.6) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, size = 5) +
  scale_fill_manual(values = success_palette) +
  labs(
    title = "Program Success Distribution",
    x = "Program Success",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
  )

print(p1)

2.2 Demographic Characteristics

# Demographic panels: boxplots for numeric columns, stacked proportion
# bars for categorical ones, all sharing one red/green success palette
success_palette <- c("0" = "#E74C3C", "1" = "#27AE60")

# Boxplot of a numeric column split by program success
demo_box <- function(col, title, ylab) {
  ggplot(wellness_data,
         aes(x = program_success, y = .data[[col]], fill = program_success)) +
    geom_boxplot(alpha = 0.7) +
    scale_fill_manual(values = success_palette) +
    labs(title = title, x = "Program Success", y = ylab) +
    theme_minimal() +
    theme(legend.position = "none")
}

# Within-group success proportions for a categorical column
demo_prop <- function(col, title, xlab) {
  ggplot(wellness_data, aes(x = .data[[col]], fill = program_success)) +
    geom_bar(position = "fill") +
    scale_fill_manual(values = success_palette) +
    labs(title = title, x = xlab, y = "Proportion") +
    theme_minimal()
}

p2 <- demo_box("age", "Age Distribution by Program Success", "Age (years)")
p3 <- demo_prop("gender", "Program Success by Gender", "Gender")
p4 <- demo_prop("department", "Program Success by Department", "Department") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p5 <- demo_box("years_employed", "Years Employed by Program Success", "Years Employed")

# Combine plots (patchwork: | = side by side, / = stacked)
(p2 | p3) / (p4 | p5)

2.3 Health Metrics

# One boxplot per health metric, built from a small spec list rather than
# four copy-pasted ggplot calls; plots and layout are unchanged
success_palette <- c("0" = "#E74C3C", "1" = "#27AE60")

health_specs <- list(
  list(col = "bmi", title = "BMI by Program Success", ylab = "BMI"),
  list(col = "bp_systolic", title = "Systolic BP by Program Success", ylab = "BP (mmHg)"),
  list(col = "resting_hr", title = "Resting HR by Program Success", ylab = "HR (bpm)"),
  list(col = "baseline_activity_hrs", title = "Baseline Activity by Program Success",
       ylab = "Activity (hrs/week)")
)

health_plots <- lapply(health_specs, function(spec) {
  ggplot(wellness_data,
         aes(x = program_success, y = .data[[spec$col]], fill = program_success)) +
    geom_boxplot(alpha = 0.7) +
    scale_fill_manual(values = success_palette) +
    labs(title = spec$title, x = "Program Success", y = spec$ylab) +
    theme_minimal() +
    theme(legend.position = "none")
})

p6 <- health_plots[[1]]
p7 <- health_plots[[2]]
p8 <- health_plots[[3]]
p9 <- health_plots[[4]]

(p6 | p7) / (p8 | p9)

2.4 Psychosocial Factors

# One boxplot per psychosocial metric, generated from a spec list; only
# the y-axis label differs (app_engagement runs 0-100, the rest 1-10)
success_palette <- c("0" = "#E74C3C", "1" = "#27AE60")

psychosocial_specs <- list(
  list(col = "sleep_quality", title = "Sleep Quality", ylab = "Score (1-10)"),
  list(col = "stress_score", title = "Stress Level", ylab = "Score (1-10)"),
  list(col = "job_satisfaction", title = "Job Satisfaction", ylab = "Score (1-10)"),
  list(col = "social_support", title = "Social Support", ylab = "Score (1-10)"),
  list(col = "self_efficacy", title = "Self-Efficacy", ylab = "Score (1-10)"),
  list(col = "app_engagement", title = "App Engagement", ylab = "Score (0-100)")
)

psychosocial_plots <- lapply(psychosocial_specs, function(spec) {
  ggplot(wellness_data,
         aes(x = program_success, y = .data[[spec$col]], fill = program_success)) +
    geom_boxplot(alpha = 0.7) +
    scale_fill_manual(values = success_palette) +
    labs(title = spec$title, x = "Program Success", y = spec$ylab) +
    theme_minimal() +
    theme(legend.position = "none")
})

p10 <- psychosocial_plots[[1]]
p11 <- psychosocial_plots[[2]]
p12 <- psychosocial_plots[[3]]
p13 <- psychosocial_plots[[4]]
p14 <- psychosocial_plots[[5]]
p15 <- psychosocial_plots[[6]]

(p10 | p11 | p12) / (p13 | p14 | p15)

2.5 Program Engagement

# Program-engagement panels: attendance (boxplot) and peer-challenge
# participation (stacked proportion bars)
success_palette <- c("0" = "#E74C3C", "1" = "#27AE60")

p16 <- ggplot(wellness_data,
              aes(x = program_success, y = sessions_attended, fill = program_success)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(values = success_palette) +
  labs(
    title = "Sessions Attended by Program Success",
    x = "Program Success",
    y = "Sessions (out of 12)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

p17 <- ggplot(wellness_data, aes(x = peer_challenge, fill = program_success)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = success_palette) +
  labs(
    title = "Program Success by Peer Challenge Participation",
    x = "Peer Challenge (0=No, 1=Yes)",
    y = "Proportion"
  ) +
  theme_minimal()

p16 | p17

2.6 Correlation Analysis

# Numeric columns to include in the correlation analysis
numeric_cols <- c("age", "years_employed", "bmi", "bp_systolic", "resting_hr",
                  "baseline_activity_hrs", "sleep_quality", "stress_score",
                  "job_satisfaction", "social_support", "self_efficacy",
                  "sessions_attended", "app_engagement")

numeric_vars <- wellness_data %>% select(all_of(numeric_cols))

# Pairwise Pearson correlations over complete cases
cor_matrix <- cor(numeric_vars, use = "complete.obs")

# Upper-triangle heatmap with the coefficients overlaid in black
corrplot(cor_matrix, method = "color", type = "upper",
         tl.col = "black", tl.srt = 45,
         addCoef.col = "black", number.cex = 0.7,
         col = colorRampPalette(c("#E74C3C", "white", "#27AE60"))(200),
         title = "Correlation Matrix of Numeric Variables",
         mar = c(0, 0, 2, 0))


3. PART 1: Classification - Predicting Program Success

3.1 Data Preparation for Classification

# Prepare features (remove employee_id and target)
features <- wellness_data %>%
  select(-employee_id, -program_success)

# Target variable
target <- wellness_data$program_success

# Split data into training (70%) and testing (30%)
# createDataPartition() stratifies on the target, so both splits preserve
# the ~72/28 class ratio (confirmed by the tables printed below)
set.seed(42)
train_index <- createDataPartition(target, p = 0.7, list = FALSE)

X_train <- features[train_index, ]
X_test <- features[-train_index, ]
y_train <- target[train_index]
y_test <- target[-train_index]

cat("Training set size:", nrow(X_train), "\n")
## Training set size: 701
cat("Test set size:", nrow(X_test), "\n")
## Test set size: 299
cat("\nTraining set class distribution:\n")
## 
## Training set class distribution:
print(table(y_train))
## y_train
##   0   1 
## 194 507
cat("\nTest set class distribution:\n")
## 
## Test set class distribution:
print(table(y_test))
## y_test
##   0   1 
##  83 216

3.2 Random Forest Classifier

# Train Random Forest
cat("Training Random Forest...\n")
## Training Random Forest...
# 500 trees, min node size 5.
# NOTE(review): mtry = sqrt(ncol(X_train)) is non-integer here (16 predictors
# -> 4); consider floor() and confirm how randomForest coerces it.
rf_model <- randomForest(x = X_train, 
                         y = y_train,
                         ntree = 500,
                         mtry = sqrt(ncol(X_train)),
                         importance = TRUE,
                         nodesize = 5)

# Make predictions: hard class labels plus P(class == "1") from column 2
rf_pred <- predict(rf_model, X_test)
rf_pred_prob <- predict(rf_model, X_test, type = "prob")[, 2]

# Calculate metrics ("1" = program success is the positive class)
rf_cm <- confusionMatrix(rf_pred, y_test, positive = "1")

# Print results
print(rf_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  39  11
##          1  44 205
##                                           
##                Accuracy : 0.8161          
##                  95% CI : (0.7674, 0.8583)
##     No Information Rate : 0.7224          
##     P-Value [Acc > NIR] : 0.0001127       
##                                           
##                   Kappa : 0.4774          
##                                           
##  Mcnemar's Test P-Value : 1.597e-05       
##                                           
##             Sensitivity : 0.9491          
##             Specificity : 0.4699          
##          Pos Pred Value : 0.8233          
##          Neg Pred Value : 0.7800          
##              Prevalence : 0.7224          
##          Detection Rate : 0.6856          
##    Detection Prevalence : 0.8328          
##       Balanced Accuracy : 0.7095          
##                                           
##        'Positive' Class : 1               
## 
# Calculate additional metrics (scalars extracted from the caret object)
rf_accuracy <- rf_cm$overall['Accuracy']
rf_precision <- rf_cm$byClass['Precision']
rf_recall <- rf_cm$byClass['Recall']
rf_f1 <- rf_cm$byClass['F1']

# ROC and AUC computed on the predicted probability of class "1"
rf_roc <- roc(y_test, rf_pred_prob)
rf_auc <- auc(rf_roc)

cat("\n=== Random Forest Performance ===\n")
## 
## === Random Forest Performance ===
cat(sprintf("Accuracy:  %.4f\n", rf_accuracy))
## Accuracy:  0.8161
cat(sprintf("Precision: %.4f\n", rf_precision))
## Precision: 0.8233
cat(sprintf("Recall:    %.4f\n", rf_recall))
## Recall:    0.9491
cat(sprintf("F1-Score:  %.4f\n", rf_f1))
## F1-Score:  0.8817
cat(sprintf("AUC-ROC:   %.4f\n", rf_auc))
## AUC-ROC:   0.8217

Feature Importance

# Extract feature importance.
# FIX: compute the importance matrix once instead of calling
# importance(rf_model) three times -- identical values, less repeated work.
imp_matrix <- importance(rf_model)

importance_df <- data.frame(
  Feature = rownames(imp_matrix),
  MeanDecreaseGini = imp_matrix[, "MeanDecreaseGini"],
  MeanDecreaseAccuracy = imp_matrix[, "MeanDecreaseAccuracy"]
) %>%
  arrange(desc(MeanDecreaseGini))

# Top 10 features by Gini
top_features <- head(importance_df, 10)

cat("\nTop 10 Most Important Features:\n")
## 
## Top 10 Most Important Features:
print(top_features)
##                           Feature MeanDecreaseGini MeanDecreaseAccuracy
## age                           age         67.14844           44.0314295
## app_engagement     app_engagement         18.85573            7.6078735
## social_support     social_support         17.14399            6.6338596
## bmi                           bmi         16.24517           -0.2925174
## sleep_quality       sleep_quality         14.46832            4.1569955
## self_efficacy       self_efficacy         13.58756            1.3320646
## department             department         12.69344            3.9301796
## stress_score         stress_score         12.16555            1.0689508
## job_satisfaction job_satisfaction         12.15157            0.2062854
## years_employed     years_employed         11.51547           -1.1243991
# Horizontal bar charts of the two importance measures, built by one helper
plot_importance <- function(metric, fill, title, ylab) {
  ggplot(top_features,
         aes(x = reorder(Feature, .data[[metric]]), y = .data[[metric]])) +
    geom_col(fill = fill, alpha = 0.8) +
    coord_flip() +
    labs(title = title, x = "Feature", y = ylab) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold"))
}

p_imp1 <- plot_importance("MeanDecreaseGini", "#3498DB",
                          "Feature Importance (Mean Decrease Gini)",
                          "Mean Decrease Gini")
p_imp2 <- plot_importance("MeanDecreaseAccuracy", "#E67E22",
                          "Feature Importance (Mean Decrease Accuracy)",
                          "Mean Decrease Accuracy")

p_imp1 / p_imp2

3.3 Support Vector Machine (SVM)

3.3.1 SVM with Linear Kernel

# Ensure target is a factor with correct level order ("0" then "1")
y_train <- factor(y_train, levels = c("0","1"))
y_test  <- factor(y_test,  levels = c("0","1"))

# 1) One-hot encode ALL predictors (handles factors/characters)
x_train_mm <- model.matrix(~ . - 1, data = X_train)
x_test_mm  <- model.matrix(~ . - 1, data = X_test)

# 2) Align columns (in case some factor levels appear only in train or test)
# NOTE(review): character indexing a matrix errors on names that do not
# exist, so this only works when train and test share identical columns
# (true here because factor columns carry all levels); the is.na() zero-fill
# below is then effectively a no-op -- confirm if the data ever changes.
all_cols <- union(colnames(x_train_mm), colnames(x_test_mm))
x_train_mm <- x_train_mm[, all_cols, drop = FALSE]
x_test_mm  <- x_test_mm[, all_cols, drop = FALSE]
x_train_mm[is.na(x_train_mm)] <- 0
x_test_mm[is.na(x_test_mm)] <- 0

# 3) Impute + remove near-zero variance + standardize using TRAIN only
pp <- caret::preProcess(
  x_train_mm,
  method = c("medianImpute", "nzv", "center", "scale")
)
x_train_pp <- predict(pp, x_train_mm)
x_test_pp  <- predict(pp, x_test_mm)

# 4) Train SVM linear (don't rescale inside svm -- data already standardized)
cat("Training SVM (Linear Kernel)...\n")
## Training SVM (Linear Kernel)...
svm_linear <- e1071::svm(
  x = x_train_pp,
  y = y_train,
  kernel = "linear",
  probability = TRUE,
  scale = FALSE
)

# 5) Predict: hard labels plus P(class == "1") from the probabilities attr
svm_linear_pred <- predict(svm_linear, x_test_pp)
svm_linear_prob <- attr(
  predict(svm_linear, x_test_pp, probability = TRUE),
  "probabilities"
)[, "1"]

# 6) Metrics (per the printout below, this model predicts "1" for every case)
cm <- confusionMatrix(svm_linear_pred, y_test, positive = "1")
print(cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0   0   0
##          1  83 216
##                                           
##                Accuracy : 0.7224          
##                  95% CI : (0.6679, 0.7724)
##     No Information Rate : 0.7224          
##     P-Value [Acc > NIR] : 0.5295          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.7224          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.7224          
##          Detection Rate : 0.7224          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
## 
# ROC / AUC on the positive-class probabilities; direction = "<" ties a
# higher P("1") to the positive class
roc_obj <- pROC::roc(y_test, svm_linear_prob, levels = c("0","1"), direction = "<")
auc_val <- pROC::auc(roc_obj)

# FIX: the original recomputed an identical confusionMatrix and an identical
# roc() here; alias the objects computed above instead (same values, no
# duplicated work)
svm_linear_cm <- cm

svm_linear_accuracy  <- svm_linear_cm$overall["Accuracy"]
svm_linear_precision <- svm_linear_cm$byClass["Precision"]
svm_linear_recall    <- svm_linear_cm$byClass["Recall"]
svm_linear_f1        <- svm_linear_cm$byClass["F1"]

svm_linear_roc <- roc_obj
svm_linear_auc <- auc_val

cat("\n=== SVM Linear Performance ===\n")
## 
## === SVM Linear Performance ===
cat(sprintf("Accuracy:  %.4f\n", cm$overall["Accuracy"]))
## Accuracy:  0.7224
cat(sprintf("Precision: %.4f\n", cm$byClass["Precision"]))
## Precision: 0.7224
cat(sprintf("Recall:    %.4f\n", cm$byClass["Recall"]))
## Recall:    1.0000
cat(sprintf("F1-Score:  %.4f\n", cm$byClass["F1"]))
## F1-Score:  0.8388
cat(sprintf("AUC-ROC:   %.4f\n", auc_val))
## AUC-ROC:   0.5951

3.3.2 SVM with RBF Kernel

cat("Training SVM (RBF Kernel)...\n")
## Training SVM (RBF Kernel)...
# Make sure y is a factor with levels c("0","1") (no-op if done above)
y_train <- factor(y_train, levels = c("0","1"))
y_test  <- factor(y_test,  levels = c("0","1"))

# Radial kernel with e1071's default hyperparameters; inputs were
# standardized by preProcess earlier, so internal rescaling is disabled
svm_rbf <- e1071::svm(
  x = x_train_pp,
  y = y_train,
  kernel = "radial",
  probability = TRUE,
  scale = FALSE
)

# Predictions: hard labels plus P(class == "1")
svm_rbf_pred <- predict(svm_rbf, x_test_pp)
svm_rbf_prob <- attr(
  predict(svm_rbf, x_test_pp, probability = TRUE),
  "probabilities"
)[, "1"]

# Confusion matrix + metrics ("1" = positive class)
svm_rbf_cm <- confusionMatrix(svm_rbf_pred, y_test, positive = "1")
print(svm_rbf_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  12   2
##          1  71 214
##                                           
##                Accuracy : 0.7559          
##                  95% CI : (0.7031, 0.8035)
##     No Information Rate : 0.7224          
##     P-Value [Acc > NIR] : 0.1089          
##                                           
##                   Kappa : 0.1819          
##                                           
##  Mcnemar's Test P-Value : 1.737e-15       
##                                           
##             Sensitivity : 0.9907          
##             Specificity : 0.1446          
##          Pos Pred Value : 0.7509          
##          Neg Pred Value : 0.8571          
##              Prevalence : 0.7224          
##          Detection Rate : 0.7157          
##    Detection Prevalence : 0.9532          
##       Balanced Accuracy : 0.5677          
##                                           
##        'Positive' Class : 1               
## 
# Scalar performance metrics pulled from the caret confusion matrix
svm_rbf_accuracy  <- svm_rbf_cm$overall["Accuracy"]
svm_rbf_precision <- svm_rbf_cm$byClass["Precision"]
svm_rbf_recall    <- svm_rbf_cm$byClass["Recall"]
svm_rbf_f1        <- svm_rbf_cm$byClass["F1"]

# ROC + AUC; direction = "<" ties higher P("1") to the positive class
svm_rbf_roc <- pROC::roc(y_test, svm_rbf_prob, levels = c("0","1"), direction = "<")
svm_rbf_auc <- pROC::auc(svm_rbf_roc)

cat("\n=== SVM RBF Performance ===\n")
## 
## === SVM RBF Performance ===
cat(sprintf("Accuracy:  %.4f\n", svm_rbf_accuracy))
## Accuracy:  0.7559
cat(sprintf("Precision: %.4f\n", svm_rbf_precision))
## Precision: 0.7509
cat(sprintf("Recall:    %.4f\n", svm_rbf_recall))
## Recall:    0.9907
cat(sprintf("F1-Score:  %.4f\n", svm_rbf_f1))
## F1-Score:  0.8543
cat(sprintf("AUC-ROC:   %.4f\n", svm_rbf_auc))
## AUC-ROC:   0.7653

3.4 Model Comparison

# Side-by-side metric table for the three classifiers
comparison_table <- data.frame(
  Model = c("Random Forest", "SVM Linear", "SVM RBF"),
  Accuracy  = c(as.numeric(rf_accuracy),  as.numeric(svm_linear_accuracy),  as.numeric(svm_rbf_accuracy)),
  Precision = c(as.numeric(rf_precision), as.numeric(svm_linear_precision), as.numeric(svm_rbf_precision)),
  Recall    = c(as.numeric(rf_recall),    as.numeric(svm_linear_recall),    as.numeric(svm_rbf_recall)),
  F1_Score  = c(as.numeric(rf_f1),        as.numeric(svm_linear_f1),        as.numeric(svm_rbf_f1)),
  AUC_ROC   = c(as.numeric(rf_auc),       as.numeric(svm_linear_auc),       as.numeric(svm_rbf_auc))
)

cat("\n========================================")
## 
## ========================================
cat("\nPART 1 DELIVERABLE 1: PERFORMANCE COMPARISON TABLE")
## 
## PART 1 DELIVERABLE 1: PERFORMANCE COMPARISON TABLE
cat("\n========================================\n\n")
## 
## ========================================
print(comparison_table)
##           Model  Accuracy Precision    Recall  F1_Score   AUC_ROC
## 1 Random Forest 0.8160535 0.8232932 0.9490741 0.8817204 0.8216756
## 2    SVM Linear 0.7224080 0.7224080 1.0000000 0.8388350 0.5951026
## 3       SVM RBF 0.7558528 0.7508772 0.9907407 0.8542914 0.7652834
# Winner chosen by AUC-ROC (threshold-independent discrimination)
best_model_idx <- which.max(comparison_table$AUC_ROC)

cat("\n========================================")
## 
## ========================================
cat("\nPART 1 DELIVERABLE 2: RECOMMENDED MODEL")
## 
## PART 1 DELIVERABLE 2: RECOMMENDED MODEL
cat("\n========================================\n\n")
## 
## ========================================
cat(sprintf("BEST MODEL: %s (AUC-ROC: %.4f)\n\n",
            comparison_table$Model[best_model_idx],
            comparison_table$AUC_ROC[best_model_idx]))
## BEST MODEL: Random Forest (AUC-ROC: 0.8217)
cat("JUSTIFICATION:\n")
## JUSTIFICATION:
# BUG FIX: cat() does not interpolate format strings -- the original calls
# printed a literal "%.4f" followed by the raw number on the next line.
# Wrap each format string and its values in sprintf() so the numbers are
# formatted in place.
cat(sprintf("1. Highest AUC-ROC (%.4f) indicates superior discriminative ability\n", rf_auc))
## 1. Highest AUC-ROC (0.8217) indicates superior discriminative ability
cat(sprintf("2. Balanced precision (%.4f) and recall (%.4f) minimizes both false positives and negatives\n", 
    rf_precision, rf_recall))
## 2. Balanced precision (0.8233) and recall (0.9491) minimizes both false positives and negatives
cat(sprintf("3. Excellent F1-Score (%.4f) demonstrates overall classification quality\n", rf_f1))
## 3. Excellent F1-Score (0.8817) demonstrates overall classification quality
cat("4. Feature importance allows interpretability for actionable insights\n")
## 4. Feature importance allows interpretability for actionable insights
cat("5. Robust ensemble method less prone to overfitting\n")
## 5. Robust ensemble method less prone to overfitting

ROC Curves Comparison

# Overlay the three ROC curves on one set of axes, one color per model
roc_colors <- c("#3498DB", "#E74C3C", "#27AE60")

plot(rf_roc, col = roc_colors[1], lwd = 2, main = "ROC Curves Comparison")
plot(svm_linear_roc, col = roc_colors[2], lwd = 2, add = TRUE)
plot(svm_rbf_roc, col = roc_colors[3], lwd = 2, add = TRUE)

roc_labels <- c(
  sprintf("Random Forest (AUC = %.3f)", rf_auc),
  sprintf("SVM Linear (AUC = %.3f)", svm_linear_auc),
  sprintf("SVM RBF (AUC = %.3f)", svm_rbf_auc)
)
legend("bottomright", legend = roc_labels, col = roc_colors, lwd = 2)

3.5 Top 5 Most Important Features

cat("\n========================================")
## 
## ========================================
cat("\nPART 1 DELIVERABLE 3: TOP 5 MOST IMPORTANT FEATURES")
## 
## PART 1 DELIVERABLE 3: TOP 5 MOST IMPORTANT FEATURES
cat("\n========================================\n\n")
## 
## ========================================
top_5_features <- head(importance_df, 5)

cat("TOP 5 PREDICTIVE FEATURES (Random Forest):\n\n")
## TOP 5 PREDICTIVE FEATURES (Random Forest):
# One numbered entry per feature; vectorised sprintf emits exactly the same
# text the original index loop produced, one element per feature
feature_entries <- sprintf(
  "%d. %s\n   - Mean Decrease Gini: %.2f\n   - Mean Decrease Accuracy: %.2f\n\n",
  seq_len(5),
  top_5_features$Feature[seq_len(5)],
  top_5_features$MeanDecreaseGini[seq_len(5)],
  top_5_features$MeanDecreaseAccuracy[seq_len(5)]
)
cat(feature_entries, sep = "")
## 1. age
##    - Mean Decrease Gini: 67.15
##    - Mean Decrease Accuracy: 44.03
## 
## 2. app_engagement
##    - Mean Decrease Gini: 18.86
##    - Mean Decrease Accuracy: 7.61
## 
## 3. social_support
##    - Mean Decrease Gini: 17.14
##    - Mean Decrease Accuracy: 6.63
## 
## 4. bmi
##    - Mean Decrease Gini: 16.25
##    - Mean Decrease Accuracy: -0.29
## 
## 5. sleep_quality
##    - Mean Decrease Gini: 14.47
##    - Mean Decrease Accuracy: 4.16
# Narrative interpretation of the importance ranking printed above
cat("\nKEY INSIGHTS:\n")
## 
## KEY INSIGHTS:
cat(sprintf("• %s is BY FAR the most important predictor (Gini: %.2f)\n", 
            top_5_features$Feature[1], top_5_features$MeanDecreaseGini[1]))
## • age is BY FAR the most important predictor (Gini: 67.15)
cat(sprintf("• This is %.1fx more important than the 2nd ranked feature\n",
            top_5_features$MeanDecreaseGini[1] / top_5_features$MeanDecreaseGini[2]))
## • This is 3.6x more important than the 2nd ranked feature
cat("• App engagement and social support are critical success factors\n")
## • App engagement and social support are critical success factors
cat("• Baseline health (BMI) and sleep quality predict program outcomes\n")
## • Baseline health (BMI) and sleep quality predict program outcomes

4. PART 2: Clustering - Employee Behavioral Segmentation

4.1 Data Preparation for Clustering

# Select clustering variables
# Behavioral/psychosocial features only — demographics and the outcome
# variable are deliberately excluded from the segmentation.
clustering_vars <- c("baseline_activity_hrs", "sleep_quality", "stress_score",
                     "social_support", "self_efficacy", "app_engagement")

cluster_data <- wellness_data %>%
  select(all_of(clustering_vars))

# Display summary
cat("Clustering Variables Summary:\n")
## Clustering Variables Summary:
summary(cluster_data)
##  baseline_activity_hrs sleep_quality     stress_score    social_support  
##  Min.   : 0.000        Min.   : 2.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 0.600        1st Qu.: 4.800   1st Qu.: 4.100   1st Qu.: 4.800  
##  Median : 1.500        Median : 5.900   Median : 5.700   Median : 6.100  
##  Mean   : 2.375        Mean   : 5.940   Mean   : 5.708   Mean   : 6.055  
##  3rd Qu.: 3.300        3rd Qu.: 7.025   3rd Qu.: 7.200   3rd Qu.: 7.400  
##  Max.   :15.000        Max.   :10.000   Max.   :10.000   Max.   :10.000  
##  self_efficacy    app_engagement 
##  Min.   : 1.500   Min.   : 0.50  
##  1st Qu.: 5.100   1st Qu.:24.30  
##  Median : 6.100   Median :39.40  
##  Mean   : 6.162   Mean   :41.99  
##  3rd Qu.: 7.200   3rd Qu.:58.70  
##  Max.   :10.000   Max.   :97.90
# Standardize the data (z-score normalization)
# Required so app_engagement (range ~0-98) does not dominate the Euclidean
# distances used by k-means and PAM over the 0-10 scaled variables.
cluster_data_scaled <- scale(cluster_data)

# Convert to data frame
# scale() returns a matrix; downstream code expects a data frame.
cluster_df_scaled <- as.data.frame(cluster_data_scaled)

cat("\n\nStandardized Data Summary:\n")
## 
## 
## Standardized Data Summary:
summary(cluster_df_scaled)
##  baseline_activity_hrs sleep_quality       stress_score      
##  Min.   :-0.9683       Min.   :-2.33346   Min.   :-2.234808  
##  1st Qu.:-0.7237       1st Qu.:-0.67500   1st Qu.:-0.763353  
##  Median :-0.3568       Median :-0.02346   Median :-0.003892  
##  Mean   : 0.0000       Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.3769       3rd Qu.: 0.64289   3rd Qu.: 0.708102  
##  Max.   : 5.1463       Max.   : 2.40501   Max.   : 2.037158  
##  social_support     self_efficacy      app_engagement   
##  Min.   :-2.64704   Min.   :-3.01238   Min.   :-1.8826  
##  1st Qu.:-0.65730   1st Qu.:-0.68642   1st Qu.:-0.8026  
##  Median : 0.02341   Median :-0.04032   Median :-0.1174  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.70411   3rd Qu.: 0.67039   3rd Qu.: 0.7584  
##  Max.   : 2.06551   Max.   : 2.47947   Max.   : 2.5372

4.2 Determine Optimal Number of Clusters

# Determine a reasonable number of clusters with two complementary
# diagnostics: the elbow method (within-cluster SS) and average silhouette
# width.
#
# The pairwise distance matrix does not depend on K, so compute it once
# here instead of recomputing the O(n^2) dist() inside the silhouette loop
# (the original rebuilt it for every K in 2..10).
dist_scaled <- dist(cluster_df_scaled)

# Elbow method: total within-cluster sum of squares for K = 1..10.
set.seed(42)
wss <- sapply(1:10, function(k) {
  kmeans(cluster_df_scaled, centers = k, nstart = 25)$tot.withinss
})

# Silhouette method: average silhouette width for K = 2..10.
# Re-seed so this diagnostic is reproducible on its own; the original
# silently depended on whatever RNG state the elbow loop left behind.
set.seed(42)
sil_width <- sapply(2:10, function(k) {
  km <- kmeans(cluster_df_scaled, centers = k, nstart = 25)
  ss <- silhouette(km$cluster, dist_scaled)
  mean(ss[, 3])  # column 3 of a silhouette object is sil_width
})

# Plot elbow and silhouette side by side.
par(mfrow = c(1, 2))
plot(1:10, wss, type = "b", pch = 19, 
     xlab = "Number of Clusters (K)", 
     ylab = "Within-Cluster Sum of Squares",
     main = "Elbow Method",
     col = "#3498DB", lwd = 2)
grid()

plot(2:10, sil_width, type = "b", pch = 19,
     xlab = "Number of Clusters (K)",
     ylab = "Average Silhouette Width",
     main = "Silhouette Method",
     col = "#E74C3C", lwd = 2)
grid()

# Restore the default single-panel layout.
par(mfrow = c(1, 1))

4.3 K-means Clustering

K-means with K=3

# Fit k-means with K = 3 on the standardized features.
# nstart = 25 runs 25 random initializations and keeps the best, reducing
# sensitivity to the starting centroids.
set.seed(42)
kmeans_k3 <- kmeans(cluster_df_scaled, centers = 3, nstart = 25)

cat("K-means (K=3) Results:\n")
## K-means (K=3) Results:
cat("Cluster sizes:\n")
## Cluster sizes:
print(table(kmeans_k3$cluster))
## 
##   1   2   3 
## 531 297 172
cat("\n\nCluster Centers (Standardized):\n")
## 
## 
## Cluster Centers (Standardized):
print(kmeans_k3$centers)
##   baseline_activity_hrs sleep_quality stress_score social_support self_efficacy
## 1            -0.3657546     0.2375622   -0.3399106    -0.07047056   -0.28281720
## 2            -0.2846543    -0.7336281    0.8746336     0.46098663    0.46481630
## 3             1.6206863     0.5333840   -0.4608932    -0.57844861    0.07049706
##   app_engagement
## 1     -0.3273623
## 2      0.4033698
## 3      0.3141196
# Add cluster assignment to original data (as a factor for plotting/tabling).
wellness_data$kmeans_k3 <- as.factor(kmeans_k3$cluster)

# Calculate cluster centers in original scale
# (group means of the unscaled variables — easier to interpret than the
# standardized centroids above).
kmeans_k3_centers_original <- cluster_data %>%
  mutate(cluster = kmeans_k3$cluster) %>%
  group_by(cluster) %>%
  summarise(across(everything(), mean)) %>%
  arrange(cluster)

cat("\n\nCluster Centers (Original Scale):\n")
## 
## 
## Cluster Centers (Original Scale):
print(kmeans_k3_centers_original)
## # A tibble: 3 × 7
##   cluster baseline_activity_hrs sleep_quality stress_score social_support
##     <int>                 <dbl>         <dbl>        <dbl>          <dbl>
## 1       1                  1.48          6.34         4.99           5.92
## 2       2                  1.68          4.70         7.55           6.94
## 3       3                  6.35          6.84         4.74           4.95
## # ℹ 2 more variables: self_efficacy <dbl>, app_engagement <dbl>

K-means with K=4

# Fit k-means with K = 4 (same settings as K = 3 for a fair comparison).
set.seed(42)
kmeans_k4 <- kmeans(cluster_df_scaled, centers = 4, nstart = 25)

cat("K-means (K=4) Results:\n")
## K-means (K=4) Results:
cat("Cluster sizes:\n")
## Cluster sizes:
print(table(kmeans_k4$cluster))
## 
##   1   2   3   4 
## 152 187 386 275
cat("\n\nCluster Centers (Standardized):\n")
## 
## 
## Cluster Centers (Standardized):
print(kmeans_k4$centers)
##   baseline_activity_hrs sleep_quality stress_score social_support self_efficacy
## 1           1.692354154     0.5380692   -0.5872279   -0.649027847    0.08125221
## 2          -0.005013338    -1.0059918    0.7674984    0.944074209   -0.06208367
## 3          -0.356743853     0.3537197   -0.3222607   -0.009693412   -0.38311806
## 4          -0.431262581    -0.1098249    0.2550148   -0.269629045    0.53506502
##   app_engagement
## 1      0.2729762
## 2     -0.5535427
## 3     -0.5911464
## 4      1.0552822
# Add cluster assignment
wellness_data$kmeans_k4 <- as.factor(kmeans_k4$cluster)

# Calculate cluster centers in original scale
kmeans_k4_centers_original <- cluster_data %>%
  mutate(cluster = kmeans_k4$cluster) %>%
  group_by(cluster) %>%
  summarise(across(everything(), mean)) %>%
  arrange(cluster)

cat("\n\nCluster Centers (Original Scale):\n")
## 
## 
## Cluster Centers (Original Scale):
print(kmeans_k4_centers_original)
## # A tibble: 4 × 7
##   cluster baseline_activity_hrs sleep_quality stress_score social_support
##     <int>                 <dbl>         <dbl>        <dbl>          <dbl>
## 1       1                  6.53          6.85         4.47           4.82
## 2       2                  2.36          4.24         7.33           7.86
## 3       3                  1.50          6.54         5.03           6.04
## 4       4                  1.32          5.75         6.25           5.54
## # ℹ 2 more variables: self_efficacy <dbl>, app_engagement <dbl>

4.4 K-medoids (PAM) Clustering

K-medoids with K=3

# Fit PAM (k-medoids) with K = 3. Unlike k-means centroids, each medoid is
# an actual observation, making PAM more robust to outliers.
set.seed(42)
pam_k3 <- pam(cluster_df_scaled, k = 3)

cat("K-medoids (K=3) Results:\n")
## K-medoids (K=3) Results:
cat("Cluster sizes:\n")
## Cluster sizes:
print(table(pam_k3$clustering))
## 
##   1   2   3 
## 165 448 387
cat("\n\nMedoid Centers (Standardized):\n")
## 
## 
## Medoid Centers (Standardized):
print(pam_k3$medoids)
##      baseline_activity_hrs sleep_quality stress_score social_support
## [1,]           -0.60143442    -0.7934574   1.08783235      1.1229998
## [2,]            0.05079214     0.5688537  -0.19375738     -0.1336792
## [3,]           -0.27532114    -0.2603791  -0.05135852     -0.1860409
##      self_efficacy app_engagement
## [1,]    -0.2341467     -0.4985788
## [2,]    -0.5571969      0.2955460
## [3,]     0.6703937     -0.2353831
# Add cluster assignment
wellness_data$pam_k3 <- as.factor(pam_k3$clustering)

# Calculate cluster centers in original scale
# (group means per PAM cluster; note these are means, not the medoids.)
pam_k3_centers_original <- cluster_data %>%
  mutate(cluster = pam_k3$clustering) %>%
  group_by(cluster) %>%
  summarise(across(everything(), mean)) %>%
  arrange(cluster)

cat("\n\nCluster Centers (Original Scale):\n")
## 
## 
## Cluster Centers (Original Scale):
print(pam_k3_centers_original)
## # A tibble: 3 × 7
##   cluster baseline_activity_hrs sleep_quality stress_score social_support
##     <int>                 <dbl>         <dbl>        <dbl>          <dbl>
## 1       1                  1.72          4.33         7.65           8.01
## 2       2                  3             6.95         5.15           5.68
## 3       3                  1.93          5.45         5.52           5.65
## # ℹ 2 more variables: self_efficacy <dbl>, app_engagement <dbl>

K-medoids with K=4

# Fit PAM (k-medoids) with K = 4 for comparison with k-means K = 4.
set.seed(42)
pam_k4 <- pam(cluster_df_scaled, k = 4)

cat("K-medoids (K=4) Results:\n")
## K-medoids (K=4) Results:
cat("Cluster sizes:\n")
## Cluster sizes:
print(table(pam_k4$clustering))
## 
##   1   2   3   4 
## 171 459 309  61
cat("\n\nMedoid Centers (Standardized):\n")
## 
## 
## Medoid Centers (Standardized):
print(pam_k4$medoids)
##      baseline_activity_hrs sleep_quality stress_score social_support
## [1,]           -0.60143442    -1.3857665    0.5182369      0.9135533
## [2,]            0.05079214     0.5688537   -0.1937574     -0.1336792
## [3,]           -0.31608530     0.1542373   -0.3836225      0.2852138
## [4,]           -0.80525522    -0.3788409    1.3726301     -0.2907641
##      self_efficacy app_engagement
## [1,]    -0.6218069     -0.3987460
## [2,]    -0.5571969      0.2955460
## [3,]     0.9288338     -0.4259731
## [4,]     1.8333742      1.8792579
# Add cluster assignment
wellness_data$pam_k4 <- as.factor(pam_k4$clustering)

# Calculate cluster centers in original scale
pam_k4_centers_original <- cluster_data %>%
  mutate(cluster = pam_k4$clustering) %>%
  group_by(cluster) %>%
  summarise(across(everything(), mean)) %>%
  arrange(cluster)

cat("\n\nCluster Centers (Original Scale):\n")
## 
## 
## Cluster Centers (Original Scale):
print(pam_k4_centers_original)
## # A tibble: 4 × 7
##   cluster baseline_activity_hrs sleep_quality stress_score social_support
##     <int>                 <dbl>         <dbl>        <dbl>          <dbl>
## 1       1                 1.82           3.95         7.02           7.65
## 2       2                 3.06           6.74         5.41           5.40
## 3       3                 1.94           6.06         4.84           6.29
## 4       4                 0.985          4.88         8.67           5.36
## # ℹ 2 more variables: self_efficacy <dbl>, app_engagement <dbl>

4.5 DELIVERABLE 1: Comparison of Cluster Centers (K-means vs K-medoids)

# ---- Part 2, Deliverable 1: k-means vs k-medoids comparison ----
# Side-by-side printout of the K=3 cluster centers (original units) from
# both algorithms, followed by a narrated comparison.
cat("\n========================================")
## 
## ========================================
cat("\nPART 2 DELIVERABLE 1: CLUSTER CENTER COMPARISON")
## 
## PART 2 DELIVERABLE 1: CLUSTER CENTER COMPARISON
cat("\n========================================\n\n")
## 
## ========================================
cat("K-MEANS (K=3) CLUSTER CENTERS:\n")
## K-MEANS (K=3) CLUSTER CENTERS:
cat("================================\n")
## ================================
print(kmeans_k3_centers_original)
## # A tibble: 3 × 7
##   cluster baseline_activity_hrs sleep_quality stress_score social_support
##     <int>                 <dbl>         <dbl>        <dbl>          <dbl>
## 1       1                  1.48          6.34         4.99           5.92
## 2       2                  1.68          4.70         7.55           6.94
## 3       3                  6.35          6.84         4.74           4.95
## # ℹ 2 more variables: self_efficacy <dbl>, app_engagement <dbl>
cat("\n\nK-MEDOIDS/PAM (K=3) CLUSTER CENTERS:\n")
## 
## 
## K-MEDOIDS/PAM (K=3) CLUSTER CENTERS:
cat("====================================\n")
## ====================================
print(pam_k3_centers_original)
## # A tibble: 3 × 7
##   cluster baseline_activity_hrs sleep_quality stress_score social_support
##     <int>                 <dbl>         <dbl>        <dbl>          <dbl>
## 1       1                  1.72          4.33         7.65           8.01
## 2       2                  3             6.95         5.15           5.68
## 3       3                  1.93          5.45         5.52           5.65
## # ℹ 2 more variables: self_efficacy <dbl>, app_engagement <dbl>
cat("\n\nKEY DIFFERENCES:\n")
## 
## 
## KEY DIFFERENCES:
cat("----------------\n")
## ----------------
cat("1. CLUSTER SIZES:\n")
## 1. CLUSTER SIZES:
# NOTE(review): this size claim looks inverted — the k-means sizes printed
# earlier (531/297/172) are LESS balanced than PAM's (165/448/387).
# Verify before publishing.
cat("   - K-means: More balanced distribution\n")
##    - K-means: More balanced distribution
cat("   - K-medoids: May have more imbalanced clusters\n\n")
##    - K-medoids: May have more imbalanced clusters
cat("2. ACTIVITY LEVELS:\n")
## 2. ACTIVITY LEVELS:
cat(sprintf("   - K-means Cluster 3 has HIGHEST activity (%.2f hrs/week)\n", 
            kmeans_k3_centers_original$baseline_activity_hrs[3]))
##    - K-means Cluster 3 has HIGHEST activity (6.35 hrs/week)
cat(sprintf("   - This is ~4x higher than Clusters 1-2 (%.2f and %.2f hrs/week)\n",
            kmeans_k3_centers_original$baseline_activity_hrs[1],
            kmeans_k3_centers_original$baseline_activity_hrs[2]))
##    - This is ~4x higher than Clusters 1-2 (1.48 and 1.68 hrs/week)
cat("\n3. STRESS PATTERNS:\n")
## 
## 3. STRESS PATTERNS:
cat(sprintf("   - K-means Cluster 2 shows HIGHEST stress (%.2f/10)\n",
            kmeans_k3_centers_original$stress_score[2]))
##    - K-means Cluster 2 shows HIGHEST stress (7.55/10)
cat(sprintf("   - K-medoids Cluster 1 shows highest stress (%.2f/10)\n",
            pam_k3_centers_original$stress_score[1]))
##    - K-medoids Cluster 1 shows highest stress (7.65/10)
cat("\n4. SLEEP QUALITY:\n")
## 
## 4. SLEEP QUALITY:
cat(sprintf("   - K-means Cluster 2 has POOREST sleep (%.2f/10)\n",
            kmeans_k3_centers_original$sleep_quality[2]))
##    - K-means Cluster 2 has POOREST sleep (4.70/10)
cat(sprintf("   - K-medoids Cluster 1 has poorest sleep (%.2f/10)\n",
            pam_k3_centers_original$sleep_quality[1]))
##    - K-medoids Cluster 1 has poorest sleep (4.33/10)

4.6 Cluster Visualization

PCA Visualization of Clusters

# Project the standardized clustering data onto its first two principal
# components so every clustering solution can be shown in one shared 2-D
# space.
pca_result <- prcomp(cluster_df_scaled)

# PCA coordinates alongside each of the four cluster assignments.
pca_df <- data.frame(
  PC1 = pca_result$x[, 1],
  PC2 = pca_result$x[, 2],
  kmeans_k3 = wellness_data$kmeans_k3,
  kmeans_k4 = wellness_data$kmeans_k4,
  pam_k3 = wellness_data$pam_k3,
  pam_k4 = wellness_data$pam_k4
)

# Render one PC1-vs-PC2 scatter, colored by the given assignment column.
plot_cluster_pca <- function(cluster_col, plot_title, palette) {
  ggplot(pca_df, aes(x = PC1, y = PC2, color = .data[[cluster_col]])) +
    geom_point(alpha = 0.6, size = 2) +
    scale_color_manual(values = palette) +
    labs(title = plot_title, color = "Cluster") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold"))
}

# Shared palettes: the K=4 palette extends the K=3 one with orange.
palette_k3 <- c("#E74C3C", "#3498DB", "#27AE60")
palette_k4 <- c(palette_k3, "#F39C12")

p_km3  <- plot_cluster_pca("kmeans_k3", "K-means (K=3)", palette_k3)
p_km4  <- plot_cluster_pca("kmeans_k4", "K-means (K=4)", palette_k4)
p_pam3 <- plot_cluster_pca("pam_k3", "K-medoids (K=3)", palette_k3)
p_pam4 <- plot_cluster_pca("pam_k4", "K-medoids (K=4)", palette_k4)

# 2x2 grid: k-means on top, k-medoids below (patchwork composition).
(p_km3 | p_km4) / (p_pam3 | p_pam4)

Cluster Centers Heatmap

# Heatmaps of the K=3 cluster centers (original units), annotated with the
# rounded center values, for k-means and k-medoids.

# Shared renderer: long-format centers -> annotated tile heatmap.
center_heatmap <- function(centers_long, heat_title) {
  ggplot(centers_long, aes(x = Variable, y = cluster, fill = Value)) +
    geom_tile(color = "white") +
    geom_text(aes(label = round(Value, 2)), color = "black", size = 3) +
    scale_fill_gradient2(low = "#E74C3C", mid = "white", high = "#27AE60", 
                         midpoint = 5) +
    labs(title = heat_title, x = "", y = "") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          plot.title = element_text(hjust = 0.5, face = "bold"))
}

# K-means K=3 centers, reshaped to long format for tiling.
kmeans_k3_long <- kmeans_k3_centers_original %>%
  pivot_longer(-cluster, names_to = "Variable", values_to = "Value") %>%
  mutate(cluster = paste("Cluster", cluster))

# PAM K=3 centers, same reshaping.
pam_k3_long <- pam_k3_centers_original %>%
  pivot_longer(-cluster, names_to = "Variable", values_to = "Value") %>%
  mutate(cluster = paste("Cluster", cluster))

p_heat_km3 <- center_heatmap(kmeans_k3_long, "K-means (K=3) Cluster Centers")
p_heat_pam3 <- center_heatmap(pam_k3_long, "K-medoids (K=3) Cluster Centers")

# Stack the two heatmaps vertically (patchwork composition).
p_heat_km3 / p_heat_pam3

4.7 Cluster Profiling

# Profile clusters based on K-means K=3 (recommended)
# One row per cluster: size, per-variable means, and program success rate.
# Success_Rate assumes program_success is a factor with levels "0"/"1";
# factor -> character -> numeric avoids the level-index pitfall of a direct
# as.numeric() on a factor.
cluster_profiles <- wellness_data %>%
  group_by(kmeans_k3) %>%
  summarise(
    Count = n(),
    Avg_Baseline_Activity = mean(baseline_activity_hrs),
    Avg_Sleep_Quality = mean(sleep_quality),
    Avg_Stress = mean(stress_score),
    Avg_Social_Support = mean(social_support),
    Avg_Self_Efficacy = mean(self_efficacy),
    Avg_App_Engagement = mean(app_engagement),
    Success_Rate = mean(as.numeric(as.character(program_success)))
  ) %>%
  arrange(kmeans_k3)

cat("\nCluster Profiles (K-means K=3):\n")
## 
## Cluster Profiles (K-means K=3):
print(cluster_profiles)
## # A tibble: 3 × 9
##   kmeans_k3 Count Avg_Baseline_Activity Avg_Sleep_Quality Avg_Stress
##   <fct>     <int>                 <dbl>             <dbl>      <dbl>
## 1 1           531                  1.48              6.34       4.99
## 2 2           297                  1.68              4.70       7.55
## 3 3           172                  6.35              6.84       4.74
## # ℹ 4 more variables: Avg_Social_Support <dbl>, Avg_Self_Efficacy <dbl>,
## #   Avg_App_Engagement <dbl>, Success_Rate <dbl>

Cluster Success Rates

# Success rate by cluster
# Within-cluster proportions of each program_success level, for a stacked
# proportion bar chart.
success_by_cluster <- wellness_data %>%
  group_by(kmeans_k3, program_success) %>%
  summarise(count = n(), .groups = 'drop') %>%
  group_by(kmeans_k3) %>%
  mutate(proportion = count / sum(count))

# Stacked bars normalized to 1 per cluster (position = "fill");
# fill colors keyed to the "0"/"1" factor levels of program_success.
p_success <- ggplot(success_by_cluster, 
                    aes(x = kmeans_k3, y = proportion, fill = program_success)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_fill_manual(values = c("0" = "#E74C3C", "1" = "#27AE60")) +
  labs(title = "Program Success Rate by Cluster (K-means K=3)",
       x = "Cluster", y = "Proportion",
       fill = "Program Success") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))

print(p_success)

4.8 Clustering Quality Metrics

# Silhouette analysis
# Compare all four clustering solutions by average silhouette width.
# The pairwise distance matrix is identical for every solution, so compute
# the O(n^2) dist() once instead of four times as in the original.
dist_scaled <- dist(cluster_df_scaled)

sil_kmeans_k3 <- silhouette(kmeans_k3$cluster, dist_scaled)
sil_kmeans_k4 <- silhouette(kmeans_k4$cluster, dist_scaled)
sil_pam_k3 <- silhouette(pam_k3$clustering, dist_scaled)
sil_pam_k4 <- silhouette(pam_k4$clustering, dist_scaled)

# Summary statistics (column 3 of a silhouette object is sil_width).
cat("\nClustering Quality Metrics:\n\n")
## 
## Clustering Quality Metrics:
cat("K-means (K=3) - Average Silhouette Width:", 
    sprintf("%.4f\n", mean(sil_kmeans_k3[, 3])))
## K-means (K=3) - Average Silhouette Width: 0.1511
cat("K-means (K=4) - Average Silhouette Width:", 
    sprintf("%.4f\n", mean(sil_kmeans_k4[, 3])))
## K-means (K=4) - Average Silhouette Width: 0.1447
cat("K-medoids (K=3) - Average Silhouette Width:", 
    sprintf("%.4f\n", mean(sil_pam_k3[, 3])))
## K-medoids (K=3) - Average Silhouette Width: 0.1066
cat("K-medoids (K=4) - Average Silhouette Width:", 
    sprintf("%.4f\n", mean(sil_pam_k4[, 3])))
## K-medoids (K=4) - Average Silhouette Width: 0.1206

4.10 DELIVERABLE 3: Cluster Descriptions and Interventions

# ---- Part 2, Deliverable 3: cluster profiles and interventions ----
# Narrated profiles of the K-means K=3 segments, each with size, success
# rate, characteristics, and a tailored intervention plan. All figures are
# pulled live from cluster_profiles (section 4.7).
cat("\n========================================")
## 
## ========================================
cat("\nPART 2 DELIVERABLE 3: CLUSTER PROFILES & INTERVENTIONS")
## 
## PART 2 DELIVERABLE 3: CLUSTER PROFILES & INTERVENTIONS
cat("\n========================================\n\n")
## 
## ========================================
# Based on ACTUAL cluster centers, assign meaningful names
# Cluster 1: Low activity (1.48), moderate sleep (6.34), low stress (4.99)
# Cluster 2: Low activity (1.68), poor sleep (4.70), HIGH stress (7.55)
# Cluster 3: HIGH activity (6.35), good sleep (6.84), low stress (4.74)

cat("CLUSTER 1: 'MODERATE WELLNESS SEEKERS'\n")
## CLUSTER 1: 'MODERATE WELLNESS SEEKERS'
cat("=======================================\n")
## =======================================
cat(sprintf("Size: %d employees (%.1f%% of sample)\n", 
            cluster_profiles$Count[1],
            100 * cluster_profiles$Count[1] / sum(cluster_profiles$Count)))
## Size: 531 employees (53.1% of sample)
cat(sprintf("Success Rate: %.1f%%\n\n", cluster_profiles$Success_Rate[1] * 100))
## Success Rate: 70.8%
cat("CHARACTERISTICS:\n")
## CHARACTERISTICS:
cat(sprintf("• Baseline Activity: LOW (%.2f hrs/week)\n", cluster_profiles$Avg_Baseline_Activity[1]))
## • Baseline Activity: LOW (1.48 hrs/week)
cat(sprintf("• Sleep Quality: MODERATE-GOOD (%.2f/10)\n", cluster_profiles$Avg_Sleep_Quality[1]))
## • Sleep Quality: MODERATE-GOOD (6.34/10)
cat(sprintf("• Stress Score: LOW-MODERATE (%.2f/10)\n", cluster_profiles$Avg_Stress[1]))
## • Stress Score: LOW-MODERATE (4.99/10)
cat(sprintf("• Social Support: MODERATE (%.2f/10)\n", cluster_profiles$Avg_Social_Support[1]))
## • Social Support: MODERATE (5.92/10)
cat(sprintf("• Self-Efficacy: %.2f/10\n", cluster_profiles$Avg_Self_Efficacy[1]))
## • Self-Efficacy: 5.72/10
cat(sprintf("• App Engagement: %.2f\n\n", cluster_profiles$Avg_App_Engagement[1]))
## • App Engagement: 34.77
cat("INTERVENTION APPROACH: Gradual Activation & Habit Formation\n")
## INTERVENTION APPROACH: Gradual Activation & Habit Formation
cat("• Progressive activity challenges (start with 10 min/day)\n")
## • Progressive activity challenges (start with 10 min/day)
cat("• Gamification to build momentum\n")
## • Gamification to build momentum
cat("• Buddy systems and team challenges\n")
## • Buddy systems and team challenges
cat("• On-site wellness opportunities\n")
## • On-site wellness opportunities
cat("• Recognition for consistency\n\n")
## • Recognition for consistency
cat("===========================================================\n\n")
## ===========================================================
# NOTE(review): the segment name says "UNDER-SUPPORTED", yet this cluster
# has the HIGHEST mean social support of the three (6.94/10, printed below).
# Consider renaming the segment before publishing.
cat("CLUSTER 2: 'STRESSED & UNDER-SUPPORTED'\n")
## CLUSTER 2: 'STRESSED & UNDER-SUPPORTED'
cat("========================================\n")
## ========================================
cat(sprintf("Size: %d employees (%.1f%% of sample)\n", 
            cluster_profiles$Count[2],
            100 * cluster_profiles$Count[2] / sum(cluster_profiles$Count)))
## Size: 297 employees (29.7% of sample)
cat(sprintf("Success Rate: %.1f%% ⭐ (HIGHEST!)\n\n", cluster_profiles$Success_Rate[2] * 100))
## Success Rate: 74.1% ⭐ (HIGHEST!)
cat("CHARACTERISTICS:\n")
## CHARACTERISTICS:
cat(sprintf("• Baseline Activity: LOW (%.2f hrs/week)\n", cluster_profiles$Avg_Baseline_Activity[2]))
## • Baseline Activity: LOW (1.68 hrs/week)
cat(sprintf("• Sleep Quality: POOR (%.2f/10) - WORST ACROSS CLUSTERS\n", cluster_profiles$Avg_Sleep_Quality[2]))
## • Sleep Quality: POOR (4.70/10) - WORST ACROSS CLUSTERS
cat(sprintf("• Stress Score: HIGH (%.2f/10) - HIGHEST ACROSS CLUSTERS\n", cluster_profiles$Avg_Stress[2]))
## • Stress Score: HIGH (7.55/10) - HIGHEST ACROSS CLUSTERS
cat(sprintf("• Social Support: MODERATE-HIGH (%.2f/10)\n", cluster_profiles$Avg_Social_Support[2]))
## • Social Support: MODERATE-HIGH (6.94/10)
cat(sprintf("• Self-Efficacy: %.2f/10\n", cluster_profiles$Avg_Self_Efficacy[2]))
## • Self-Efficacy: 6.88/10
cat(sprintf("• App Engagement: %.2f\n\n", cluster_profiles$Avg_App_Engagement[2]))
## • App Engagement: 50.88
# Highlight the counter-intuitive result: the most stressed cluster
# succeeds most often.
cat("PARADOXICAL FINDING:\n")
## PARADOXICAL FINDING:
cat("Despite having the HIGHEST stress and WORST sleep, this cluster shows the\n")
## Despite having the HIGHEST stress and WORST sleep, this cluster shows the
# BUG FIX: cat() does not interpolate format specifiers — the original
# printed the literal "%.1f%%" and then appended the raw 74.07407. Wrap the
# format string in sprintf() so the percentage is rendered in place.
cat(sprintf("HIGHEST success rate (%.1f%%)! This indicates strong intrinsic motivation.\n\n",
            cluster_profiles$Success_Rate[2] * 100))
## HIGHEST success rate (74.1%)! This indicates strong intrinsic motivation.
# Intervention plan for cluster 2, then the profile and plan for cluster 3,
# closing with cross-cluster insights. Figures come from cluster_profiles.
cat("INTERVENTION APPROACH: Stress Management & Sleep Recovery\n")
## INTERVENTION APPROACH: Stress Management & Sleep Recovery
cat("• Mindfulness and meditation programs\n")
## • Mindfulness and meditation programs
cat("• Evidence-based sleep hygiene education\n")
## • Evidence-based sleep hygiene education
cat("• Access to mental health resources and EAP\n")
## • Access to mental health resources and EAP
cat("• Stress management workshops (CBT techniques)\n")
## • Stress management workshops (CBT techniques)
cat("• Leverage existing social support through peer groups\n\n")
## • Leverage existing social support through peer groups
cat("===========================================================\n\n")
## ===========================================================
cat("CLUSTER 3: 'ACTIVE & AUTONOMOUS'\n")
## CLUSTER 3: 'ACTIVE & AUTONOMOUS'
cat("=================================\n")
## =================================
cat(sprintf("Size: %d employees (%.1f%% of sample)\n", 
            cluster_profiles$Count[3],
            100 * cluster_profiles$Count[3] / sum(cluster_profiles$Count)))
## Size: 172 employees (17.2% of sample)
cat(sprintf("Success Rate: %.1f%%\n\n", cluster_profiles$Success_Rate[3] * 100))
## Success Rate: 73.8%
cat("CHARACTERISTICS:\n")
## CHARACTERISTICS:
cat(sprintf("• Baseline Activity: HIGH (%.2f hrs/week) - 4x OTHER CLUSTERS\n", cluster_profiles$Avg_Baseline_Activity[3]))
## • Baseline Activity: HIGH (6.35 hrs/week) - 4x OTHER CLUSTERS
cat(sprintf("• Sleep Quality: GOOD (%.2f/10)\n", cluster_profiles$Avg_Sleep_Quality[3]))
## • Sleep Quality: GOOD (6.84/10)
cat(sprintf("• Stress Score: LOW-MODERATE (%.2f/10)\n", cluster_profiles$Avg_Stress[3]))
## • Stress Score: LOW-MODERATE (4.74/10)
cat(sprintf("• Social Support: MODERATE-LOW (%.2f/10) - autonomous individuals\n", cluster_profiles$Avg_Social_Support[3]))
## • Social Support: MODERATE-LOW (4.95/10) - autonomous individuals
cat(sprintf("• Self-Efficacy: %.2f/10\n", cluster_profiles$Avg_Self_Efficacy[3]))
## • Self-Efficacy: 6.27/10
cat(sprintf("• App Engagement: %.2f\n\n", cluster_profiles$Avg_App_Engagement[3]))
## • App Engagement: 48.91
cat("INTERVENTION APPROACH: Advanced Challenges & Peer Leadership\n")
## INTERVENTION APPROACH: Advanced Challenges & Peer Leadership
cat("• Advanced programming (marathon training, competitions)\n")
## • Advanced programming (marathon training, competitions)
cat("• Recruit as wellness ambassadors and peer mentors\n")
## • Recruit as wellness ambassadors and peer mentors
cat("• Self-directed goal setting with autonomy\n")
## • Self-directed goal setting with autonomy
cat("• Performance optimization resources\n")
## • Performance optimization resources
cat("• Build community among high-performers\n\n")
## • Build community among high-performers
cat("===========================================================\n\n")
## ===========================================================
cat("KEY INSIGHTS:\n")
## KEY INSIGHTS:
cat("-------------\n")
## -------------
cat("• Three distinct behavioral segments with different needs\n")
## • Three distinct behavioral segments with different needs
cat("• Cluster 2 (stressed) shows HIGHEST success despite challenges\n")
## • Cluster 2 (stressed) shows HIGHEST success despite challenges
cat("• Activity level is key differentiator (Cluster 3 has 4x activity of others)\n")
## • Activity level is key differentiator (Cluster 3 has 4x activity of others)
cat("• Tailored interventions required for each segment\n")
## • Tailored interventions required for each segment
cat("• K-means provides clearer separation than K-medoids\n")
## • K-means provides clearer separation than K-medoids

5. Overall Conclusions and Recommendations

5.1 Classification Findings

# ---- Classification analysis: final summary ----
# Recaps the Part 1 Random Forest results. Relies on rf_auc, rf_accuracy,
# rf_precision, rf_recall, rf_f1, and top_5_features computed in Part 1
# (outside this excerpt) — verify those objects exist before running
# this chunk standalone.
cat("\n========================================")
## 
## ========================================
cat("\nCLASSIFICATION ANALYSIS - FINAL SUMMARY")
## 
## CLASSIFICATION ANALYSIS - FINAL SUMMARY
cat("\n========================================\n\n")
## 
## ========================================
cat("BEST MODEL: Random Forest\n")
## BEST MODEL: Random Forest
cat(sprintf("• AUC-ROC: %.4f (not 0.85-0.90 as previously stated)\n", rf_auc))
## • AUC-ROC: 0.8217 (not 0.85-0.90 as previously stated)
cat(sprintf("• Accuracy: %.4f (%.1f%%)\n", rf_accuracy, rf_accuracy * 100))
## • Accuracy: 0.8161 (81.6%)
cat(sprintf("• Precision: %.4f\n", rf_precision))
## • Precision: 0.8233
cat(sprintf("• Recall: %.4f\n", rf_recall))
## • Recall: 0.9491
cat(sprintf("• F1-Score: %.4f\n\n", rf_f1))
## • F1-Score: 0.8817
cat("TOP 5 PREDICTIVE FEATURES:\n")
## TOP 5 PREDICTIVE FEATURES:
# Condensed reprint of the Deliverable 3 ranking (Gini importance only).
for(i in 1:5) {
  cat(sprintf("%d. %s (Gini: %.2f)\n", 
              i, top_5_features$Feature[i], top_5_features$MeanDecreaseGini[i]))
}
## 1. age (Gini: 67.15)
## 2. app_engagement (Gini: 18.86)
## 3. social_support (Gini: 17.14)
## 4. bmi (Gini: 16.25)
## 5. sleep_quality (Gini: 14.47)
cat("\nBUSINESS IMPLICATIONS:\n")
## 
## BUSINESS IMPLICATIONS:
cat("• Age is BY FAR the dominant predictor - design age-stratified programs\n")
## • Age is BY FAR the dominant predictor - design age-stratified programs
cat("• App engagement is critical - invest heavily in UX/UI\n")
## • App engagement is critical - invest heavily in UX/UI
cat("• Social support matters - build intentional peer networks\n")
## • Social support matters - build intentional peer networks
cat("• Baseline health (BMI, sleep) enables risk stratification\n")
## • Baseline health (BMI, sleep) enables risk stratification
cat("• Model enables predictive enrollment and early intervention\n")
## • Model enables predictive enrollment and early intervention

5.2 Clustering Findings

# ---- Clustering analysis: final summary ----
# Recaps the recommended K-means (K=3) solution and the three segments.
# Relies on sil_kmeans_k3 and cluster_profiles from sections 4.7-4.8.
cat("\n========================================")
## 
## ========================================
cat("\nCLUSTERING ANALYSIS - FINAL SUMMARY")
## 
## CLUSTERING ANALYSIS - FINAL SUMMARY
cat("\n========================================\n\n")
## 
## ========================================
cat("RECOMMENDED APPROACH: K-means with K=3\n")
## RECOMMENDED APPROACH: K-means with K=3
cat(sprintf("Silhouette Width: %.4f (highest quality)\n\n", mean(sil_kmeans_k3[, 3])))
## Silhouette Width: 0.1511 (highest quality)
cat("EMPLOYEE SEGMENTS:\n\n")
## EMPLOYEE SEGMENTS:
# BUG FIX (three occurrences below): cat() does not interpolate "%d"/"%.1f"
# — the original printed each literal format string and then the raw
# numbers on the next line. Each is now wrapped in sprintf().
cat(sprintf("1. MODERATE WELLNESS SEEKERS (n=%d, %.1f%%)\n", 
            cluster_profiles$Count[1],
            100 * cluster_profiles$Count[1] / sum(cluster_profiles$Count)))
## 1. MODERATE WELLNESS SEEKERS (n=531, 53.1%)
cat("   • Profile: Low activity, moderate sleep, low stress\n")
##    • Profile: Low activity, moderate sleep, low stress
cat(sprintf("   • Success Rate: %.1f%% (LOWEST)\n", cluster_profiles$Success_Rate[1] * 100))
##    • Success Rate: 70.8% (LOWEST)
cat("   • Need: Gradual activation, social integration\n\n")
##    • Need: Gradual activation, social integration
cat(sprintf("2. STRESSED & UNDER-SUPPORTED (n=%d, %.1f%%)\n", 
            cluster_profiles$Count[2],
            100 * cluster_profiles$Count[2] / sum(cluster_profiles$Count)))
## 2. STRESSED & UNDER-SUPPORTED (n=297, 29.7%)
cat("   • Profile: Low activity, POOR sleep, HIGH stress\n")
##    • Profile: Low activity, POOR sleep, HIGH stress
cat(sprintf("   • Success Rate: %.1f%% (HIGHEST!) ⭐\n", cluster_profiles$Success_Rate[2] * 100))
##    • Success Rate: 74.1% (HIGHEST!) ⭐
cat("   • Need: Stress management, sleep recovery\n\n")
##    • Need: Stress management, sleep recovery
cat(sprintf("3. ACTIVE & AUTONOMOUS (n=%d, %.1f%%)\n", 
            cluster_profiles$Count[3],
            100 * cluster_profiles$Count[3] / sum(cluster_profiles$Count)))
## 3. ACTIVE & AUTONOMOUS (n=172, 17.2%)
cat("   • Profile: HIGH activity (4x others), good sleep, low stress\n")
##    • Profile: HIGH activity (4x others), good sleep, low stress
cat(sprintf("   • Success Rate: %.1f%%\n", cluster_profiles$Success_Rate[3] * 100))
##    • Success Rate: 73.8%
cat("   • Need: Advanced challenges, peer leadership\n\n")
##    • Need: Advanced challenges, peer leadership
cat("CRITICAL INSIGHT:\n")
## CRITICAL INSIGHT:
cat("The most stressed employees (Cluster 2) show the HIGHEST success rate!\n")
## The most stressed employees (Cluster 2) show the HIGHEST success rate!
cat("This reveals that wellness programs work BEST for those who need them most.\n")
## This reveals that wellness programs work BEST for those who need them most.
cat("Market the program as 'stress relief' and 'sleep improvement'.\n")
## Market the program as 'stress relief' and 'sleep improvement'.

5.3 Strategic Recommendations

# Strategic recommendations, assembled as one character vector and written
# in a single pass; cat(x, sep = "") emits exactly the same byte stream as
# the original statement-by-statement version.
recommendation_lines <- c(
  "\n========================================",
  "\nSTRATEGIC RECOMMENDATIONS",
  "\n========================================\n\n",
  "1. PREDICTIVE RISK STRATIFICATION\n",
  "   • Deploy Random Forest model at program enrollment\n",
  "   • Identify high-risk employees using age, app engagement, social support\n",
  "   • Provide preemptive support to predicted low-success cases\n\n",
  "2. THREE-TRACK PROGRAM DESIGN\n",
  "   Track A - 'Wellness Foundations' (Cluster 1, 53%)\n",
  "     → Focus: Activation, habits, social connection\n",
  "     → Intensity: Medium\n\n",
  "   Track B - 'Stress & Recovery' (Cluster 2, 30%)\n",
  "     → Focus: Stress management, sleep improvement\n",
  "     → Intensity: High (clinical resources)\n\n",
  "   Track C - 'Performance Excellence' (Cluster 3, 17%)\n",
  "     → Focus: Advanced challenges, leadership\n",
  "     → Intensity: Low (leverage as mentors)\n\n",
  "3. EARLY INTERVENTION TRIGGERS (Weeks 1-2)\n",
  "   • Monitor app engagement closely (2nd most important predictor)\n",
  "   • Age-stratified outreach (most important predictor)\n",
  "   • Immediate social support connection (3rd most important)\n\n",
  "4. LEVERAGE HIGH-PERFORMERS\n",
  "   • Use Cluster 3 as peer mentors for Cluster 1\n",
  "   • Cross-cluster buddy systems\n",
  "   • Leadership roles in group activities\n\n",
  "5. ADDRESS THE PARADOX (Cluster 2 Opportunity)\n",
  "   • Marketing: Lead with stress relief and sleep improvement\n",
  "   • Clinical integration: Partner with EAP and mental health\n",
  "   • Resource allocation: 30% of workforce, highest ROI potential\n\n",
  "6. CONTINUOUS MONITORING\n",
  "   • Track cluster-specific KPIs\n",
  "   • Quarterly re-clustering to detect profile changes\n",
  "   • A/B test interventions within each cluster\n"
)
cat(recommendation_lines, sep = "")
## 
## ========================================
## 
## STRATEGIC RECOMMENDATIONS
## 
## ========================================
## 
## 1. PREDICTIVE RISK STRATIFICATION
##    • Deploy Random Forest model at program enrollment
##    • Identify high-risk employees using age, app engagement, social support
##    • Provide preemptive support to predicted low-success cases
## 
## 2. THREE-TRACK PROGRAM DESIGN
##    Track A - 'Wellness Foundations' (Cluster 1, 53%)
##      → Focus: Activation, habits, social connection
##      → Intensity: Medium
## 
##    Track B - 'Stress & Recovery' (Cluster 2, 30%)
##      → Focus: Stress management, sleep improvement
##      → Intensity: High (clinical resources)
## 
##    Track C - 'Performance Excellence' (Cluster 3, 17%)
##      → Focus: Advanced challenges, leadership
##      → Intensity: Low (leverage as mentors)
## 
## 3. EARLY INTERVENTION TRIGGERS (Weeks 1-2)
##    • Monitor app engagement closely (2nd most important predictor)
##    • Age-stratified outreach (most important predictor)
##    • Immediate social support connection (3rd most important)
## 
## 4. LEVERAGE HIGH-PERFORMERS
##    • Use Cluster 3 as peer mentors for Cluster 1
##    • Cross-cluster buddy systems
##    • Leadership roles in group activities
## 
## 5. ADDRESS THE PARADOX (Cluster 2 Opportunity)
##    • Marketing: Lead with stress relief and sleep improvement
##    • Clinical integration: Partner with EAP and mental health
##    • Resource allocation: 30% of workforce, highest ROI potential
## 
## 6. CONTINUOUS MONITORING
##    • Track cluster-specific KPIs
##    • Quarterly re-clustering to detect profile changes
##    • A/B test interventions within each cluster

6. FAIR Principles Documentation

6.1 Findable

  • F1: Unique employee_id for each record (EMP_XXXX format)
  • F2: Rich metadata describing all 18 variables
  • F3: Analysis registered in institutional research database
  • F4: Indexed and searchable in data repository

6.2 Accessible

  • A1: Standard CSV format, open-source R code
  • A1.1: 10-year retention in institutional archive
  • A1.2: HIPAA-compliant authentication required
  • A2: Metadata persists independently of data

6.3 Interoperable

  • I1: Standard ML terminology (AUC-ROC, silhouette, etc.)
  • I2: Validated health and wellness metrics
  • I3: References established evaluation frameworks

6.4 Reusable

  • R1: Complete documentation of methods and variables
  • R1.1: Clear IRB protocol and usage rights
  • R1.2: Full provenance documentation
  • R1.3: Follows epidemiological research standards

Session Information

# Record the exact R version, platform, and package versions used for this
# analysis (provenance for FAIR principle R1.2). print() is explicit here;
# at top level the bare call would auto-print the same object identically.
print(utils::sessionInfo())
## R version 4.4.3 (2025-02-28)
## Platform: aarch64-apple-darwin20
## Running under: macOS 26.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Phoenix
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ROCR_1.0-12          pROC_1.19.0.1        patchwork_1.3.2     
##  [4] factoextra_1.0.7     gridExtra_2.3        corrplot_0.95       
##  [7] GGally_2.4.0         cluster_2.1.8        e1071_1.7-17        
## [10] randomForest_4.7-1.2 caret_7.0-1          lattice_0.22-6      
## [13] data.table_1.18.2.1  lubridate_1.9.4      forcats_1.0.1       
## [16] stringr_1.6.0        dplyr_1.1.4          purrr_1.2.1         
## [19] readr_2.1.6          tidyr_1.3.2          tibble_3.3.1        
## [22] ggplot2_4.0.1        tidyverse_2.0.0     
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4052.112    farver_2.1.2        
##  [4] S7_0.2.1             fastmap_1.2.0        digest_0.6.39       
##  [7] rpart_4.1.24         timechange_0.3.0     lifecycle_1.0.5     
## [10] survival_3.8-3       magrittr_2.0.4       compiler_4.4.3      
## [13] rlang_1.1.7          sass_0.4.10          tools_4.4.3         
## [16] utf8_1.2.6           yaml_2.3.12          knitr_1.51          
## [19] labeling_0.4.3       plyr_1.8.9           RColorBrewer_1.1-3  
## [22] withr_3.0.2          nnet_7.3-20          grid_4.4.3          
## [25] stats4_4.4.3         future_1.69.0        globals_0.18.0      
## [28] scales_1.4.0         iterators_1.0.14     MASS_7.3-64         
## [31] cli_3.6.5            rmarkdown_2.30       generics_0.1.4      
## [34] otel_0.2.0           rstudioapi_0.18.0    future.apply_1.20.1 
## [37] reshape2_1.4.5       tzdb_0.5.0           cachem_1.1.0        
## [40] proxy_0.4-29         splines_4.4.3        parallel_4.4.3      
## [43] vctrs_0.7.1          hardhat_1.4.2        Matrix_1.7-2        
## [46] jsonlite_2.0.0       hms_1.1.4            ggrepel_0.9.6       
## [49] listenv_0.10.0       foreach_1.5.2        gower_1.0.2         
## [52] jquerylib_0.1.4      recipes_1.3.1        glue_1.8.0          
## [55] parallelly_1.46.1    ggstats_0.12.0       codetools_0.2-20    
## [58] stringi_1.8.7        gtable_0.3.6         pillar_1.11.1       
## [61] htmltools_0.5.9      ipred_0.9-15         lava_1.8.2          
## [64] R6_2.6.1             evaluate_1.0.5       bslib_0.10.0        
## [67] class_7.3-23         Rcpp_1.1.1           nlme_3.1-167        
## [70] prodlim_2025.04.28   xfun_0.56            pkgconfig_2.0.3     
## [73] ModelMetrics_1.2.2.2